package cn.edu.bjtu.abstractimpl.analyzer;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;

import cn.edu.bjtu.abstractimpl.segment.AbstractDocumentSegmentation;
import cn.edu.bjtu.interfaces.WordSegResult;
import cn.edu.bjtu.interfaces.document.IDocument;



/**
 * Word segmentation implemented on top of the Lucene framework's
 * {@link SmartChineseAnalyzer}.
 * <p>
 * This class also exists in the Spark code base; it was copied here and made to
 * fit the document-segmentation hierarchy, otherwise unchanged.
 * <p>
 * The class is {@link Serializable}. The Lucene {@link Analyzer} it wraps is not,
 * so the analyzer is held in a transient field and rebuilt when an instance is
 * deserialized.
 *
 * @author Alex
 */
public class LuceneDocumentAnalyzer extends AbstractDocumentSegmentation implements Serializable{
	private static final long serialVersionUID = -9054868506622909536L;

	/** Tokens with fewer characters than this (after trimming) are left out of the result. */
	private static final int MIN_WORD_LENGTH = 2;

	/** Lucene analyzer doing the actual tokenization; transient because Analyzer is not Serializable. */
	private transient Analyzer analyzer;

	/**
	 * Creates a segmenter backed by a new {@link SmartChineseAnalyzer}.
	 */
	public LuceneDocumentAnalyzer() {
		super();
		analyzer = createAnalyzer();
	}

	/**
	 * Builds the analyzer used by this class. The {@code false} argument is passed
	 * to the {@code SmartChineseAnalyzer(boolean)} constructor (its
	 * "use default stop words" switch in the Lucene API).
	 */
	private static Analyzer createAnalyzer() {
		return new SmartChineseAnalyzer(false);
	}

	/**
	 * Serialization hook: restores the default state, then recreates the transient
	 * analyzer so that a deserialized instance is usable.
	 */
	private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
		in.defaultReadObject();
		analyzer = createAnalyzer();
	}

	/**
	 * Tokenizes {@code content} and joins the resulting words with single spaces.
	 * Each token is trimmed, and tokens shorter than two characters are dropped.
	 *
	 * @param content text to segment; {@code null} is treated as empty text
	 * @return the kept tokens separated by one space, or an empty string when
	 *         there are none
	 * @throws RuntimeException wrapping the {@link IOException} if the token
	 *         stream fails
	 */
	public  String analyze(String content) {
		if (content == null) {
			return "";
		}
		StringBuilder sb = new StringBuilder();
		// try-with-resources guarantees close() even when reset()/incrementToken()
		// throws; an unclosed stream would make the analyzer's next call fail.
		try (TokenStream ts = analyzer.tokenStream("", new StringReader(content))) {
			// Register the attribute before reset(); the same instance is updated
			// in place on every incrementToken() call.
			CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
			ts.reset();
			while (ts.incrementToken()) {
				String word = term.toString().trim();
				if (word.length() >= MIN_WORD_LENGTH) {
					if (sb.length() > 0) {
						sb.append(' ');
					}
					sb.append(word);
				}
			}
			// end() belongs after the last token, exactly once per stream.
			ts.end();
		} catch (IOException e) {
			throw new RuntimeException("Failed to tokenize content with SmartChineseAnalyzer", e);
		}
		return sb.toString();
	}

	/**
	 * Segments the content of {@code doc}.
	 *
	 * @param doc document whose {@code getContent()} text is segmented
	 * @return the space-separated tokens, see {@link #analyze(String)}
	 */
	@Override
	public String segment(IDocument doc) {
		return analyze(doc.getContent());
	}

	/**
	 * Segments raw text.
	 *
	 * @param doc text to segment
	 * @return the space-separated tokens, see {@link #analyze(String)}
	 */
	@Override
	public String segment(String doc) {
		return analyze(doc);
	}

	/**
	 * Returns a result object whose {@code toString()} yields the segmented
	 * content of {@code doc}. Segmentation is not performed here; it runs each
	 * time {@code toString()} is invoked on the returned object.
	 *
	 * @param doc document to segment
	 * @return a lazily evaluated segmentation result
	 */
	@Override
	public WordSegResult segmentExtend(final IDocument doc) {
		return new WordSegResult() {
			@Override
			public String toString(){
				return LuceneDocumentAnalyzer.this.segment(doc.getContent());
			}
		};
	}

	/**
	 * Returns a result object whose {@code toString()} yields the segmented form
	 * of {@code doc}. Segmentation is not performed here; it runs each time
	 * {@code toString()} is invoked on the returned object.
	 *
	 * @param doc text to segment
	 * @return a lazily evaluated segmentation result
	 */
	@Override
	public WordSegResult segmentExtend(final String doc) {
		return new WordSegResult() {
			@Override
			public String toString(){
				return LuceneDocumentAnalyzer.this.analyze(doc);
			}
		};
	}

}